from AuxLibraries import *
import pandas as pd
import GWPs
import local_utils as ut
import sector_maps as smap
ONEDRIVE = '/Users/simonecenci/OneDrive - Imperial College London/'

#%%
def get_info_data():
    # Compustat for merging
    cmp_full = pd.read_csv('local_data/Compustat_Data_0624.csv')
    cmp = cmp_full[['conml', 'conm', 'ids', 'gvkey', 'loc', 'GICS_level_1', 'GICS_level_2']].groupby('gvkey').last().reset_index()
    return cmp_full, cmp

def get_cdp_data():
    dt = pd.read_csv('CDP_data_0524_v7.csv', low_memory=False).drop(columns = ['Unnamed: 0'])
    dt['Country'] = dt['Country'].replace(GWPs.get_country_name())
    dt = dt.loc[dt.cdp_id.dropna().index].reset_index(drop=True)
    dt['mrg_'] = dt['cdp_id'].astype(int).astype(str)+'-'+dt['Gas']+'-'+dt['ReportingYear'].astype(str)
    dt = dt.groupby('mrg_').last().reset_index(drop=True)   
    return dt

def light_merge(dt, country_merge=False):
    s = pd.read_csv('local_data/Firms_info_matching.csv').rename(columns = {'GICS Sector Name': 'GICS_level_1',
                                                                                   'Company Common Name': 'Organisation',
                                                                                   'Exchange Ticker': 'Ticker'})
    
    s['org_clean'] = ut.utils().clean_name(s.Organisation)
    s = s.loc[s.ISIN.dropna().index].reset_index(drop=True)
    s['GICS_level_1'] = s['GICS_level_1'].replace({'Industrials': 'Industrial',
                                                   'Materials': 'Material',
                                                   'Financials': 'Financial',
                                                   'Consumer Staples': 'Consumer Staple'})
    s = s.loc[s.GICS_level_1.dropna().index].reset_index(drop=True)
    
    #===
    dx = dt.copy()
    print('Starting point:', len(dx.cdp_id.unique()))
    dx['Country_adj'] = dx['Country'].replace(GWPs.get_country_name())
    ciso = pd.read_excel('local_data/country_iso.xlsx')
    isor = pd.read_excel('local_data/iso_region.xlsx')
    isor['Region'] = isor['Region'].apply(lambda x: x.split('+ \xa0')[1])
    ciso = ciso[['Country', 'ISO-3']]
    ciso.columns = ['Country_adj', 'loc' ]
    ciso = isor.merge(ciso)
    dx = dx.merge(ciso, on = ['Country_adj'])
    dx['mrg_'] = dx['cdp_id'].astype(int).astype(str)+'-'+dx['Gas']+'-'+dx['ReportingYear'].astype(str)
    dx = dx.groupby('mrg_').last().reset_index(drop=True)
    
    print('Mid point 1:', len(dx.cdp_id.unique()))
    
    dx['GEOGRAPHY'] = dx['Region'].replace(['Northern Europe', 'Eastern Europe',  'Western Europe', 'Southern Europe'], ['Europe']*4).replace(\
                                           ['Western Africa', 'Eastern Africa', 'Northern Africa', 'Southern Africa', 'Middle Africa'], ['Africa']*5).replace(\
                                           ['Central America', 'South America', 'Caribbean'], ['Latin America']*3).replace(\
                                           ['Southern Asia', 'South-eastern Asia', 'Eastern Asia', 'Australia and New Zealand', 'Micronesia', 'Polynesia', 'Melanesia'], ['Asia-Pacific']*7  ).replace(\
                                           ['Western Asia', 'Central Asia'], ['Middle-East']*2  ).replace(\
                                           ['Northern America'], ['North America'])
                                                                                                                                                                    
    dx['SP_GEOGRAPHY'] = dx['GEOGRAPHY'].replace(['Latin America','Africa','Middle-East'], ['Rest of the world']*3)
    dx = dx.loc[dx.SP_GEOGRAPHY.dropna().index].reset_index(drop=True)
    print('Mid point 2:', len(dx.cdp_id.unique()))
    #== Make sector
    dxs, secMAP = smap.get_sector_map_step1(dx)
    dxs, secMAP2 = smap.get_sector_map_step2(dxs)

    print('End point:', len(dxs.cdp_id.unique()))
    return dxs
def get_isin_gvkey_cdp_mapping(dt, cmp, s):
    isin1=[]
    isin2=[]
    for i in range(len(dt)):
        if dt['ISIN'].iloc[i]!=None:
            if ',' in dt['ISIN'].iloc[i]:
                isin1.append(dt['ISIN'].iloc[i].split(',')[0])
                isin2.append(dt['ISIN'].iloc[i].split(',')[1].strip())
            else:
                isin1.append(dt['ISIN'].iloc[i])
                isin2.append(None)
        else:
            isin1.append(dt['ISIN'].iloc[i])
            isin2.append(None)
            
    dt['ISIN_1'] = isin1
    dt['ISIN_2'] = isin2
    
    a = dt[['cdp_id', 'ISIN']].dropna().groupby('ISIN').last()
    b = dt[['cdp_id', 'ISIN_1']].dropna().groupby('ISIN_1').last()
    c = dt[['cdp_id', 'ISIN_2']].dropna().groupby('ISIN_2').last()
    
    l = pd.concat((a, b, c)).reset_index().rename(columns = {'index': 'ISIN'})
    c1 = l.merge(cmp[['gvkey', 'ids']].rename(columns = {'ids': 'ISIN'}))
    c1 = c1.groupby('cdp_id').last().reset_index()
    
    c2 = l.merge(s[['gvkey', 'ISIN']])
    c2 = c2.groupby('cdp_id').last().reset_index()
    c2 = c2[c2.gvkey.isin(c1.gvkey) == False]
    z = pd.concat((c1,c2))
    
    return z

def primary_merge(dt,  cmp):
    s = pd.read_csv('local_data/Firms_info_matching.csv').rename(columns = {'GICS Sector Name': 'GICS_level_1',
                                                                                   'Company Common Name': 'Organisation',
                                                                                   'Exchange Ticker': 'Ticker'})
    
    s['org_clean'] = ut.utils().clean_name(s.Organisation)
    s = s.loc[s.ISIN.dropna().index].reset_index(drop=True)
    s['GICS_level_1'] = s['GICS_level_1'].replace({'Industrials': 'Industrial',
                                                   'Materials': 'Material',
                                                   'Financials': 'Financial',
                                                   'Consumer Staples': 'Consumer Staple'})
    s = s.loc[s.GICS_level_1.dropna().index].reset_index(drop=True)
    

    #===
    dx = dt.copy()
    dx = dx[dx.Gas.isin(['CO2', 'CH4'])]
    print('Starting point:', len(dx.cdp_id.unique()))
    dx['Country_adj'] = dx['Country'].replace(GWPs.get_country_name())
    ciso = pd.read_excel('local_data/country_iso.xlsx')
    isor = pd.read_excel('local_data/iso_region.xlsx')
    isor['Region'] = isor['Region'].apply(lambda x: x.split('+ \xa0')[1])
    ciso = ciso[['Country', 'ISO-3']]
    ciso.columns = ['Country_adj', 'loc' ]
    ciso = isor.merge(ciso)
    #== Loosing companies with no country
    dx = dx.merge(ciso, on = ['Country_adj'])
    
    print('Merging countries:', len(dx.cdp_id.unique()))
    ## Merge with Compustat
    mapping = get_isin_gvkey_cdp_mapping(dt, cmp, s)
    mapping = cmp[['gvkey', 'GICS_level_1', 'GICS_level_2', 'loc', 'ids']].merge(mapping)
    dx1 = dx.merge(mapping[['gvkey', 'cdp_id', 'GICS_level_1', 'GICS_level_2', 'loc', 'ids']])
    dx2 = dx.merge(cmp[cmp.gvkey.isin(dx1.gvkey) == False][['ids', 'gvkey', 'GICS_level_1', 'GICS_level_2']].dropna().rename(columns = {'ids':'ISIN'}))

    dx['org_clean'] = ut.utils().clean_name(dx.Organisation)  
    cmp_step2 = cmp[cmp.gvkey.isin(dx1.gvkey) == False].rename(columns = {'conml':'Organisation'})
    cmp_step2 = cmp_step2[cmp_step2.gvkey.isin(dx2.gvkey) == False].rename(columns = {'conml':'Organisation'})   
    cmp_step2['org_clean'] = ut.utils().clean_name(cmp_step2.Organisation)
    dx3 = dx.merge(cmp_step2[['org_clean', 'gvkey', 'GICS_level_1']].dropna(), on = 'org_clean')

    dxy = pd.concat((dx1, dx2, dx3))
    cmp_step3 = cmp[cmp.gvkey.isin(dxy.gvkey) == False].rename(columns = {'conm':'Organisation'})
    cmp_step3['org_clean'] = ut.utils().clean_name(cmp_step3.Organisation)
    dx4 = dx.merge(cmp_step3[['org_clean', 'gvkey', 'GICS_level_1']].dropna(), on = 'org_clean')

    dxy = pd.concat((dxy,  dx4))
    print('Merging compustat:', len(dxy.cdp_id.unique()))
    ## Use refinitiv
    g = dx[dx.cdp_id.isin(dxy.cdp_id.unique()) == False ]
    g1 = g.merge(s[['org_clean', 'GICS_level_1', 'gvkey']], on = 'org_clean')
    dxy = pd.concat((dxy, g1))
    print('Merging refinitiv (1):', len(dxy.cdp_id.unique()))
    #=====
    g =dx[dx.cdp_id.isin(dxy.cdp_id.unique()) == False ]
    g1 = g.merge(s[['ISIN', 'GICS_level_1', 'gvkey']], on = 'ISIN')
    dxy = pd.concat((dxy, g1))
    print('Merging refinitiv (1):', len(dxy.cdp_id.unique()))
    dx = dxy.copy()
    dx['mrg_'] = dx['cdp_id'].astype(int).astype(str)+'-'+dx['Gas']+'-'+dx['ReportingYear'].astype(str)
    dx = dx.groupby('mrg_').last().reset_index(drop=True)
    
    dx['GEOGRAPHY'] = dx['Region'].replace(['Northern Europe', 'Eastern Europe',  'Western Europe', 'Southern Europe'], ['Europe']*4).replace(\
                                           ['Western Africa', 'Eastern Africa', 'Northern Africa', 'Southern Africa'], ['Africa']*4).replace(\
                                           ['Central America', 'South America', 'Caribbean'], ['Latin America']*3).replace(\
                                           ['Southern Asia', 'South-eastern Asia', 'Eastern Asia', 'Australia and New Zealand', 'Micronesia'], ['Asia-Pacific']*5  ).replace(\
                                           ['Western Asia', 'Central Asia'], ['Middle-East']*2  ).replace(\
                                           ['Northern America'], ['North America'])
                                                                                                                                                                    
    dx['SP_GEOGRAPHY'] = dx['GEOGRAPHY'].replace(['Latin America','Africa','Middle-East'], ['Rest of the world']*3)
    dx = dx.loc[dx.SP_GEOGRAPHY.dropna().index].reset_index(drop=True)
    print('Final:', len(dx.cdp_id.unique()))
    return dx

def add_exogeneous(H):
    G = H.copy()
    policies =pd.read_csv('local_data/env_policies.csv').rename(columns = {'index': 'mrg_cp'})
    gdp = pd.read_csv('local_data/GDP.csv').drop(columns = ['Unnamed: 0'])
    com_prices = pd.read_excel('local_data/annual_index_nominal.xlsx').rename(columns = {'Year': 'fyear'})
    policies['country'] = policies['mrg_cp'].apply(lambda x: x.split('-')[1])
    policies['rfyear'] = policies['mrg_cp'].apply(lambda x: x.split('-')[0])

    policies = policies.sort_values(by = ['country', 'rfyear'])
    policies['EnvPolicies'] =  \
        policies[['country', 'EnvPolicies']].groupby('country').rolling(3).mean().values.ravel().tolist()

    G['fyear'] = G['ReportingYear'] - 1
    G['mrg_cp'] = G['fyear'].astype(int).astype(str)+'-'+G['loc']
    G = G.merge(policies, on = 'mrg_cp')
    G = G.merge(gdp, on = 'mrg_cp')
    G = G.merge(com_prices, on = 'fyear')
    
    exogeneous = list(policies.columns[1:])+\
        list(gdp.columns[1:]) + \
                list(com_prices.columns[1:])

    return G, exogeneous
def add_alternative():
    epi = pd.read_csv('local_data/PNRGINDEXM.csv')
    epi['DATE'] = pd.to_datetime(epi['DATE'])
    epi = epi.set_index('DATE')
    epi = epi.resample('Y').mean()
    epi['ReportingYear'] = epi.index.year
    epi.columns = ['Energy Price Index', 'ReportingYear']
    return(epi)
def get_GWP_table(Z):
    T  =  Z.copy()
    T['Gas'] = T['mrg'].apply(lambda x: x.split('-')[0])
    T['Report'] = T['mrg'].apply(lambda x: x.split('-')[1])
    T['Horizon'] = T['mrg'].apply(lambda x: x.split('-')[2])
    T = T.drop(columns = 'mrg')
    T = T[T.Gas == 'CH4'].drop(columns = 'Gas')
    T = T.pivot('Report', 'Horizon', 'ConversionFactor')
    T = T.loc[['SAR', 'TAR', 'AR4', 'AR5', 'AR6']]
    T = T[['20', '100']]
    T.columns = ['20-year', '100-year']
    return T

def get_Scope1():
    dt = get_cdp_data()
    dt['mrg'] = dt['cdp_id'].astype(str)+'-'+dt['Gas']+'-'+dt['ReportingYear'].astype(str)
    dt = dt.groupby('mrg').last().reset_index(drop=True)

    X = dt[dt['Value (tCO2e)'].isin(['Question not applicable', 'Hidden Answer']) == False]
    Scope1 = X[['cdp_id', 'ReportingYear', 'Value (tCO2e)']].astype(float).groupby(['cdp_id', 'ReportingYear']).sum().reset_index()
    Scope1 = Scope1.rename(columns = {'Value (tCO2e)': 'Scope_1'})
    Scope1['mrg'] = Scope1['cdp_id'].astype(int).astype(str)+'-'+Scope1['ReportingYear'].astype(int).astype(str)
    return Scope1
def get_total_by_gas(GASTYPE='CH4'):
    dt = get_cdp_data()
    dt['mrg'] = dt['cdp_id'].astype(str)+'-'+dt['Gas']+'-'+dt['ReportingYear'].astype(str)
    dt = dt.groupby('mrg').last().reset_index(drop=True)

    X = dt[dt['Value (tCO2e)'].isin(['Question not applicable', 'Hidden Answer']) == False]
    if GASTYPE != 'Scope_1':
        X = X[X.Gas == GASTYPE]
    G = X[['cdp_id', 'ReportingYear', 'Value (tCO2e)']].astype(float).groupby(['cdp_id', 'ReportingYear']).sum().reset_index()
    G = G.rename(columns = {'Value (tCO2e)': GASTYPE})
    G['mrg'] = G['cdp_id'].astype(int).astype(str)+'-'+G['ReportingYear'].astype(int).astype(str)
    
    return G
def full_scope1_emissions():
    '''
    Some companies have "Hidden Answers" in their disclosure of the fiscal year
    For those companies we estimate Scope 1 emissions as the sum of the individual GHGs
    '''
    M1 = pd.read_csv('local_data/scope1_emissions.csv')
    M1['mrg'] = M1['cdp_id'].astype(int).astype(str)+'-'+M1['ReportingYear'].astype(int).astype(str)
    M2 = get_total_by_gas('Scope_1').rename(columns = {"Scope_1": "Scope 1"})
    M2['mrg'] = M2['cdp_id'].astype(int).astype(str)+'-'+M2['ReportingYear'].astype(int).astype(str)
    M2 = M2[M2.mrg.isin(M1.mrg) == False]
    M1 = pd.concat((M1[['mrg', 'cdp_id', 'ReportingYear', 'Scope 1']],M2[['mrg', 'cdp_id', 'ReportingYear', 'Scope 1']]))
    M1 = M1.groupby('mrg').last().reset_index()
    
    return M1

def get_bootstrapped_errors(x):
    res = []
    for i in range(1000):
        s = x.loc[np.random.choice(x.index, int(0.9*len(x)), replace=True)]
        res.append(s.mean())
    res = np.sort(res)
    ub = res[975]
    lb = res[25]
    return np.mean(res),lb,ub

